16S rRNA amplicon sequencing QC

Author

Laura Symul

Published

May 27, 2025

Caution

This document only perform minimal QC at the moment - a lot of the upstream QC has been performed by Joseph Elsherbini and Johnathan Shih in Kwon’s lab and Asavela at CAPRISA.

The main purpose of this document is to summarize at the species level, harmonize the data with the CRF data, and to prepare the data so that it can be integrated with other assays for downstream analyses.

Code
library(tidyverse)
library(magrittr)
library(gt)
library(patchwork)
library(SummarizedExperiment)
library(tidySummarizedExperiment)
library(phyloseq)
library(mia)


# tmp <- fs::dir_map("R scripts mg/", source)
tmp <- fs::dir_map("R scripts/", source)

theme_set(theme_light())

Loading the data

Code
data_source <- "real"

# TODO: change so it runs for different users.
VIBRANT_dropbox_dir <- 
  "/Users/laurasymul/Dropbox/Academia/Projects/VIBRANT Study Files/"


ps <- 
  readRDS(str_c(VIBRANT_dropbox_dir,"12_VIBRANT 16S/raw_merged_all_pools_ps_20250512.rds"))

ps
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 282710 taxa and 4362 samples ]
sample_data() Sample Data:       [ 4362 samples by 28 sample variables ]
tax_table()   Taxonomy Table:    [ 282710 taxa by 7 taxonomic ranks ]

Sample data:

Code
ps@sam_data |> 
  as.data.frame() |> 
  as_tibble() |> 
  glimpse()
Rows: 4,362
Columns: 28
$ sampleID                                  <chr> "2152449_068200035_0000", "2…
$ Well.ID                                   <chr> "B01", "C01", "D01", "E01", …
$ Sample.ID.Barcode                         <chr> "2152449", "2154015", "21540…
$ Patient.ID                                <chr> "68200035", "68200040", "682…
$ Visit.code                                <chr> "0", "0", "1200", "1000", "1…
$ IDT.adapter.name                          <chr> "IDT10_UDI_Adapter2", "IDT10…
$ i5.index.forward                          <chr> "CGTAGAACAG", "ACCTAAGAGC", …
$ i5.index.reverse                          <chr> "CTGTTCTACG", "GCTCTTAGGT", …
$ i7.index.reverse                          <chr> "ATAAGCGGAG", "CGACCTTAAT", …
$ i7.index                                  <chr> "CTCCGCTTAT", "ATTAAGGTCG", …
$ sample_id                                 <chr> "2152449_068200035_0000", "2…
$ spike_in_Allobacillus_halotolerans_rel    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ spike_in_Imtechella_halotolerans_rel      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ spike_in_Allobacillus_halotolerans_counts <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ spike_in_Imtechella_halotolerans_counts   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ total_spike_in_rel                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ total_spike_in_counts                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ CST                                       <chr> "III", "IV-B", "IV-B", "IV-B…
$ subCST                                    <chr> "III-B", "IV-B", "IV-B", "IV…
$ score                                     <dbl> 0.9200522, 0.6396295, 0.7183…
$ pool                                      <chr> "pool1", "pool1", "pool1", "…
$ Specimen.type                             <chr> "Vaginal Swab", "Vaginal Swa…
$ Volume.pooled..uL.                        <dbl> 3, 3, 3, 3, 3, 3, 3, 3, 3, 3…
$ Researcher                                <chr> "Jiawu", "Jiawu", "Jiawu", "…
$ Description                               <chr> "VIBRANT", "VIBRANT", "VIBRA…
$ Pool10                                    <chr> NA, NA, NA, NA, NA, NA, NA, …
$ Original.MGH.aliquot.Unique.identifier    <dbl> NA, NA, NA, NA, NA, NA, NA, …
$ Site                                      <chr> NA, NA, NA, NA, NA, NA, NA, …

Taxonomic table:

Code
ps@tax_table |> as.data.frame() |> as_tibble() |> glimpse()
Rows: 282,710
Columns: 7
$ Domain  <chr> "d_Bacteria", "d_Bacteria", "d_Bacteria", "d_Bacteria", "d_Bac…
$ Phylum  <chr> "p_Bacillota", "p_Bacillota_A", "p_Bacillota", "p_Bacillota", …
$ Class   <chr> "c_Bacilli", "c_Clostridia", "c_Bacilli", "c_Bacilli", "c_Cori…
$ Order   <chr> "o_Lactobacillales", "o_Lachnospirales", "o_Lactobacillales", …
$ Family  <chr> "f_Lactobacillaceae", "f_Lachnospiraceae", "f_Lactobacillaceae…
$ Genus   <chr> "g_Lactobacillus", "g_UBA629", "g_Lactobacillus", "g_Lactobaci…
$ Species <chr> "Lactobacillus_crispatus", "Ca_Lachnocurva_vaginae", "Lactobac…

Data cleaning and harmonization

Dimensions and sparsity

The phyloseq object contains counts for 4362 samples and 282710 ASVs.

Code
counts <- ps@otu_table |> as.matrix()
prop_0 <- mean(counts == 0)

This is a very large number of ASVs and most of them are not present in most samples as the proportion of 0s in the count table is 99.93%.

Code
ASV_stats <- 
  tibble(
    mean_counts = colMeans(counts),
    n_non_zero = colSums(counts > 0),
    f_non_zero = n_non_zero / nrow(counts),
    max_counts = apply(counts, 2, max)
  )
Code
ASV_stats |> ggplot() + aes(x = mean_counts) + geom_histogram(bins = 30) +
  scale_x_log10(n.breaks = 10) +
  labs(x = "Mean counts (log scale)", y = "Number of ASVs") +
  
  
ASV_stats |> ggplot() + aes(x = max_counts) + geom_histogram(bins = 30) +
    scale_x_log10(n.breaks = 8) +
  labs(x = "Max number of counts (log scale)", y = "Number of ASVs") +
  
ASV_stats |> ggplot() + aes(x = f_non_zero) + geom_histogram(bins = 30) +
    scale_x_log10(n.breaks = 6) +
  labs(x = "Proportion of samples with non-zero counts(log scale)", y = "Number of ASVs") +
  
  plot_layout(nrow = 2)

93.11% of ASVs are present (non-zero counts) in only 1 sample.

Code
rm(counts, ASV_stats)

Creation a of SummarizedExperiment object

To facilitate data harmonization and cleaning, we transform the phyloseq object to a SummarizedExperiment object summarized at the species level to reduce the size of the object.

Code
# throws an error
ps_agg <- ps |> tax_glom("Species") 
Note

At a later stage, we could keep the full ASV counts, but at this stage, for most analyses, the species level is sufficient.

Code
make_se_from_ps <- function(ps) {
  
  # ROWDATA (Tax table)
  asv_tax_table <- 
    ps@tax_table |> 
    as.data.frame() |>
    rownames_to_column("ASV_sequence") |> 
    mutate(ASV_nb = row_number()) |>
    group_by(Species) |> 
    mutate(ASV_id = str_c(Species, " (ASV", ASV_nb |> str_pad(width = 6, pad = "0"),")")) |> 
    ungroup() |> 
    mutate(rownames = ASV_id) |> 
    column_to_rownames("rownames") |> 
    relocate(ASV_sequence, .after = ASV_id)  |> 
    janitor::clean_names()
  
  # (asv_tax_table$species |> unique() |> length()) == (asv_tax_table |> dplyr::count(domain, phylum, class, order, family, genus, species) |> nrow())
  
  se_rowdata <- 
    asv_tax_table |> 
    group_by(domain, phylum, class, order, family, genus, species) |> 
    summarize(
      n_asv = n(),
      asv_ids = str_c(asv_id, collapse = ", "),
      .groups = "drop"
    )
  
  # COLDATA (Sample data)
  se_coldata <-
    ps@sam_data |> 
    as.data.frame() |> 
    as_tibble() |>  # weird trick because of the sticky `phyloseq` attribute
    as.data.frame() |> 
    set_rownames(rownames(ps@sam_data))
  
  coldata_dictionary <- 
    tibble(
      original_name = se_coldata |> colnames()
    )
  
  se_coldata <- se_coldata |> janitor::clean_names()
  se_coldata <- se_coldata |> dplyr::rename(visit_code_ps = visit_code)
  
  coldata_dictionary <- 
    coldata_dictionary |> bind_cols(name = colnames(se_coldata))
  
  se_coldata <- se_coldata |> mutate(sample_id_16s = rownames(se_coldata)) 
   

  # ASSSAY
  raw_counts <- ps@otu_table |> as.matrix()
  class(raw_counts) <- "matrix"
  raw_counts <- raw_counts |> t()
  rownames(raw_counts) <- asv_tax_table$asv_id
  
  tmp <- 
    raw_counts |> 
    as.data.frame() |>
    mutate(species = asv_tax_table$species) 
  
  counts_assay <- 
    tmp |> 
    group_by(species) |> 
    summarize(across(everything(), sum)) |> 
    column_to_rownames("species")
  
  # SummarizedExperiment
  
  se <- SummarizedExperiment(
    assays = list(
      counts = counts_assay,
      rel_ab = t(t(counts_assay) / colSums(counts_assay))
    ),
    rowData = se_rowdata,
    colData = se_coldata,
    metadata = list(
      name = "VIBRANT 16S rRNA amplicon sequencing",
      data = today(),
      coldata_dictionary = coldata_dictionary,
      asv_tax_table = asv_tax_table
    )
  )
  
  
  # reordering taxa
  taxa_order <- 
    se |> as_tibble() |> 
    group_by(.feature) |> summarize(mean_rel_ab = mean(rel_ab)) |> 
    arrange(-mean_rel_ab) |> pull(.feature)
  
  se <- se[taxa_order, ]
  
  
  se
}
Code
se_16S <- make_se_from_ps(ps)
Code
se_16S
# A SummarizedExperiment-tibble abstraction: 3,563,754 × 42
# Features=817 | Samples=4362 | Assays=counts, rel_ab
   .feature           .sample counts  rel_ab sample_id well_id sample_id_barcode
   <chr>              <chr>    <dbl>   <dbl> <chr>     <chr>   <chr>            
 1 Lactobacillus_ine… 215244…  65218 7.21e-1 2152449_… B01     2152449          
 2 Lactobacillus_cri… 215244…   2569 2.84e-2 2152449_… B01     2152449          
 3 Gardnerella_vagin… 215244…  12469 1.38e-1 2152449_… B01     2152449          
 4 Ca_Lachnocurva_va… 215244…     25 2.76e-4 2152449_… B01     2152449          
 5 Sneathia_vaginalis 215244…      0 0       2152449_… B01     2152449          
 6 Fannyhessea_vagin… 215244…    871 9.63e-3 2152449_… B01     2152449          
 7 Sneathia_sanguine… 215244…      6 6.63e-5 2152449_… B01     2152449          
 8 Prevotella_bivia   215244…    293 3.24e-3 2152449_… B01     2152449          
 9 Prevotella_amnii   215244…      0 0       2152449_… B01     2152449          
10 f_Dethiosulfovibr… 215244…      0 0       2152449_… B01     2152449          
# ℹ 40 more rows
# ℹ 35 more variables: patient_id <chr>, visit_code_ps <chr>,
#   idt_adapter_name <chr>, i5_index_forward <chr>, i5_index_reverse <chr>,
#   i7_index_reverse <chr>, i7_index <chr>, sample_id_2 <chr>,
#   spike_in_allobacillus_halotolerans_rel <dbl>,
#   spike_in_imtechella_halotolerans_rel <dbl>,
#   spike_in_allobacillus_halotolerans_counts <dbl>, …

We renamed the columns in the sample data:

Code
se_16S@metadata$coldata_dictionary |> gt()
original_name name
sampleID sample_id
Well.ID well_id
Sample.ID.Barcode sample_id_barcode
Patient.ID patient_id
Visit.code visit_code_ps
IDT.adapter.name idt_adapter_name
i5.index.forward i5_index_forward
i5.index.reverse i5_index_reverse
i7.index.reverse i7_index_reverse
i7.index i7_index
sample_id sample_id_2
spike_in_Allobacillus_halotolerans_rel spike_in_allobacillus_halotolerans_rel
spike_in_Imtechella_halotolerans_rel spike_in_imtechella_halotolerans_rel
spike_in_Allobacillus_halotolerans_counts spike_in_allobacillus_halotolerans_counts
spike_in_Imtechella_halotolerans_counts spike_in_imtechella_halotolerans_counts
total_spike_in_rel total_spike_in_rel
total_spike_in_counts total_spike_in_counts
CST cst
subCST sub_cst
score score
pool pool
Specimen.type specimen_type
Volume.pooled..uL. volume_pooled_u_l
Researcher researcher
Description description
Pool10 pool10
Original.MGH.aliquot.Unique.identifier original_mgh_aliquot_unique_identifier
Site site

Harmonization of sample and participant IDs

We have data for the following samples:

Code
se_16S@colData |> 
  as.data.frame() |> 
  ggplot() +
  aes(x = patient_id, y = visit_code_ps) +
  geom_point() +
  facet_grid(. ~ site, scales = "free", space = "free") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  )

We see that we’ll have to harmonize the patient_id and visit_code columns. From harmonized participant ID, we’ll be able to impute the site location.

Code
se_16S@colData <- 
  se_16S@colData |> 
  as.data.frame() |> 
  mutate(
    pid = 
      patient_id |> 
      str_remove_all("_") |> 
      str_remove_all("-") |> 
      str_replace_all("^68","068") 
  ) |> 
  DataFrame()
Code
se_16S@colData |> 
  as.data.frame() |> 
  mutate(pid_length = str_length(pid)) |> 
  filter(pid_length != 9) |> 
  dplyr::count(patient_id, pid) |> 
  gt(
    caption = "Samples with non-standard participant IDs"
  )
Samples with non-standard participant IDs
patient_id pid n
98120009 98120009 7
Amplification NEG control Amplification NEG control 26
Amplification POS control Amplification POS control 41
Mock 1 Mock 1 44
Mock 2 Mock 2 46
Nuclease-free water Nucleasefree water 5
Nuclease_free water Nucleasefree water 29
Unused swab_C2 Unused swabC2 34
Note

As in the metagenomics, there were 7 samples from a participant from another study (starting by (0)98).

The rest looks legit.

We next clean the visit_code_ps

Code
 se_16S@colData |> 
  as.data.frame() |> 
  dplyr::count(visit_code_ps)
Code
se_16S@colData <- 
  se_16S@colData |> 
  as.data.frame() |> 
  mutate(
    visit_code = 
      visit_code_ps |> 
      str_replace("V7", "1700") |> 
      str_replace("V8", "1900") |>
      str_replace("V9", "2120") |> 
      str_pad(width = 4, pad = "0") 
  ) |> 
  DataFrame()

We also create new columns for the sample_type and control_type

Code
se_16S@colData <- 
  se_16S@colData |> 
  as.data.frame() |> 
  mutate(
    sample_type = 
      case_when(
        str_detect(pid, "^068") ~ "Clinical sample",
        str_detect(pid, "98") ~ "Clinical sample (other study)",
        str_detect(pid, "Amplification NEG control") | str_detect(sample_id, "Amplification_NEG_control") ~ "Amplification negative control",
        str_detect(pid, "Amplification POS control") | str_detect(sample_id, "Amplification_POS_control") ~ "Amplification positive control",
        str_detect(pid, "Mock") | str_detect(sample_id, "Mock") ~ "Positive control",
        str_detect(pid, "water") | str_detect(pid, "Unused") | str_detect(sample_id, "water") | str_detect(sample_id, "Unused")  ~ "Negative control",
      ),
    control_type = 
      case_when(
        str_detect(sample_type, "Clinical") ~ "",
        str_detect(sample_id, "Mock1") ~ "Mock 1",
        str_detect(pid, "Mock") ~ pid,
        str_detect(pid, "water") | str_detect(sample_id, "water") ~ "Nuclease-free water",
        str_detect(pid, "Unused") | str_detect(sample_id, "Unused")  ~ "Unused swab + C2",
        TRUE ~ sample_type
      )
  ) |> 
  DataFrame()
Code
se_16S@colData |> 
  as.data.frame() |> 
  dplyr::count(sample_type, control_type) |> 
  gt()
sample_type control_type n
Amplification negative control Amplification negative control 28
Amplification positive control Amplification positive control 46
Clinical sample 4120
Clinical sample (other study) 7
Negative control Nuclease-free water 35
Negative control Unused swab + C2 35
Positive control Mock 1 45
Positive control Mock 2 46
Code
se_16S$pid <- ifelse(!(se_16S$sample_type %in% c("Clinical sample", "Clinical sample (other study)")), NA_character_, se_16S$pid)

We also create a column ext_lib_plate that contains the extraction library plate (from Sinaye) so that we’ll be able to compare the controls between 16S and metagenomics (the barcodes do not seem to match those provided by Michael - there, the controls barcode start with “EQ”)

Note

Does that “EQ”-starting barcodes sound familiar or not at all?

TODO

Number of observation per pid and visit_code

After harmonizing the pid and visit_code, we notice that a number of participants have two samples per visit.

All of these “replicates” are from “pool10” which, I assume, was a bunch of samples that had to be re-sequenced.

Code
replicates <- 
  se_16S@colData |> 
  as.data.frame() |> 
  as_tibble() |> 
  filter(sample_type == "Clinical sample")

replicates |> 
  dplyr::count(pid, visit_code, name = "n samples per participant x visit") |> 
  dplyr::count(`n samples per participant x visit`, name = "occurence") |> 
  gt()
n samples per participant x visit occurence
1 3784
2 168
Code
# se_16S@colData |> 
#   as.data.frame() |> 
#   as_tibble() |> 
#   filter(sample_type == "Clinical sample") |> 
#   dplyr::count(patient_id, visit_code_ps) |> 
#   dplyr::count(n)


replicates <- 
  replicates |>
  group_by(pid, visit_code) |> arrange(visit_code) |>  mutate(n = n(), rep_nb = row_number()) |> ungroup() |> 
  filter(n > 1) |> 
  select(sample_id, patient_id, pid, visit_code_ps, visit_code, rep_nb, pool) |> 
  arrange(pid, visit_code_ps) 

replicates |> 
  dplyr::count(rep_nb, pool) |> 
  gt()
rep_nb pool n
1 pool1 29
1 pool2 26
1 pool3 2
1 pool5 73
1 pool6 19
1 pool7 3
1 pool8 14
1 pool9 2
2 pool10 168

We’ll do a comparison of the composition of these samples below.

Code
se_16S@colData$resequenced <- "No"
se_16S@colData$resequenced[se_16S@colData$sample_id %in% replicates$sample_id[replicates$pool != "pool10"]] <- "Yes"
se_16S@colData$resequenced[se_16S@colData$sample_id %in% replicates$sample_id[replicates$pool == "pool10"]] <- "2nd sequencing"

se_16S@colData$resequenced <- se_16S@colData$resequenced |> factor(levels = c("No", "Yes", "2nd sequencing"))

Matching pid and visit_code with the CRF data

Caution

Below, I load a list of all visits for all participants as found in the CRF data - this is NOT based on CRF-based specimen collection data.

TODO: change so that it uses the “expected” data based on the weekly specimen collection data (CRF 35) and the home swab collection data (CRF 47).

Code
load("/Users/laurasymul/OneDrive - UCL/Academia/Research/VIBRANT clinical data UCLouvain/Data processed/2025-05-14/visits_summary.RData", verbose = TRUE)
Loading objects:
  v
Code
v <- v |> dplyr::rename(visit_code_crf = visit_code) |> mutate(pid = str_c("068", pid))
Code
v <- 
  v |> 
  mutate(visit_code = visit_code_crf |> as.character() |> str_pad(width = 4, pad = "0")) 
Code
v |> 
  ggplot() +
  aes(x = pid, y = visit_code, col = randomized) +
  geom_point() +
  facet_grid(. ~ location + randomized, scales = "free", space = "free") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  ) + 
  ggtitle("CRF data")
Code
matched <- 
  dplyr::full_join(
    se_16S@colData |> 
      as.data.frame() |> 
      filter(sample_type == "Clinical sample") |> 
      as_tibble() |> 
      select(pid, visit_code, patient_id, visit_code_ps, sample_id, site) |> 
      mutate(has_16S_data = TRUE),
    v |> select(pid, visit_code) |>  mutate(has_CRF_data = TRUE),
    by = join_by(pid, visit_code)
  ) |> 
  mutate(
    has_16S_data = has_16S_data |> replace_na(FALSE),
    has_CRF_data = has_CRF_data |> replace_na(FALSE)
  ) |> 
  dplyr::left_join(
    v |> select(pid, location, met_eligibility, randomized) |> distinct() |> mutate(pid_in_CRFs = TRUE),
    by = join_by(pid)
  ) |> 
  mutate(
    pid_in_CRFs = pid_in_CRFs |> replace_na(FALSE)
  )
Code
matched |> 
  dplyr::count(has_16S_data, has_CRF_data, pid_in_CRFs, randomized, name = "n entries") |> 
  gt()
has_16S_data has_CRF_data pid_in_CRFs randomized n entries
FALSE TRUE TRUE FALSE 414
FALSE TRUE TRUE TRUE 201
TRUE FALSE FALSE NA 9
TRUE FALSE TRUE FALSE 3
TRUE FALSE TRUE TRUE 110
TRUE TRUE TRUE FALSE 75
TRUE TRUE TRUE TRUE 3923
Code
matched |> 
  ggplot() +
  aes(x = pid, y = visit_code, col = has_16S_data, shape = has_CRF_data) +
  geom_point() +
  facet_grid(. ~ location + ifelse(randomized,"Randomized", "Not randomized"), scales = "free", space = "free") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  ) + 
  scale_shape_manual(values = c(1, 16)) +
  ggtitle("matched data")

Let’s focus on

  • “Clinical samples” in the 16S data that we do not have in the CRFs
  • Randomized participants with non-matching visits in the 16S data.

Participant ID not in the CRFs

Code
matched |> 
  filter(is.na(location)) |> 
  dplyr::count(pid, patient_id) |> 
  gt()
pid patient_id n
0681000xx 068_10_00xx 1
068200000 068200000 3
068200000 68200000 5

These are likely “test” participants.

Randomized participants with non-matching visits in the 16S data

Code
matched |> 
  filter(randomized) |> 
   ggplot() +
  aes(x = pid, y = visit_code, col = has_16S_data, shape = has_CRF_data) +
  geom_point(size = 1.5) +
  facet_grid(. ~ location + ifelse(randomized,"Randomized", "Not randomized"), scales = "free", space = "free") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
  ) + 
  scale_shape_manual(values = c(1, 16)) +
  ggtitle("matched data")

We have a quite ok match. Some observations around the mismatches:

  • We see a few examples of likely mis-labelled visits (e.g., around the screening visits)

  • and a few “missing” 16S at visits where swabs are not expected (e.g. 2121 or 1511).

  • One SA participant has a whole week of missed swabs, but another week without crfs but with swabs.

  • A few US participants were supposed to collect swabs but likely did not (to check once the “daily specimen collection CRF” is cleaned)

Overall, it looks fine.

Note

We should probably review this with the team once the CRF data have been cleaned and QCed.

Exploratory & QC analyses

Total number of counts per sample.

Code
se_16S@colData$total_counts <- colSums(assay(se_16S, "counts"))
Code
se_16S@colData |> 
  as.data.frame() |> 
  ggplot() +
  aes(x = total_counts, fill = control_type) +
  geom_histogram(bins = 30) +
  scale_x_log10(n.breaks = 8) +
  labs(x = "Total counts (log scale)", y = "Nb of samples") +
  facet_grid(sample_type ~ ., scales = "free") +
  theme(strip.text.y = element_text(angle = 0))

Code
se_16S@colData |> 
  as.data.frame() |> 
  filter(sample_type == "Clinical sample") |> 
  ggplot() +
  aes(x = resequenced, y = total_counts, fill = resequenced) +
  # geom_histogram(bins = 30) +
  geom_violin(alpha = 0.3, col = "transparent", scale = "width") +
  ggbeeswarm::geom_quasirandom(alpha = 0.5, size = 0.1) +
  geom_path(aes(group = interaction(pid, visit_code)), alpha = 0.1) +
  scale_y_log10(n.breaks = 8) +
  guides(fill = "none") +
  labs(x = "Re-sequenced", y = "Total counts (log scale)") +
  ggtitle("Clinical samples only")

Code
tmp <- 
  se_16S@colData |> 
  as.data.frame() |> 
  filter(!is.na(visit_code)) |> 
  group_by(visit_code) |> mutate(n = n()) |> ungroup() |> 
  filter(n > 20) |> 
  mutate(visit_code = visit_code |> factor()) |> 
  arrange(pid, visit_code)

tmp |> 
  ggplot() +
  aes(y = total_counts, x = visit_code, fill = visit_code, col = visit_code) +
  # geom_path(aes(group = pid), alpha = 0.15) +
  geom_violin(draw_quantiles = 0.5, alpha = 0.5) + # fill = "gray", 
  scale_y_log10(n.breaks = 10) +
  guides(fill = "none", col = "none") +
  labs(y = "Total counts (log scale)", x = "Visit") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5))

Controls

Negative controls

Code
tmp <- 
  se_16S |> 
  filter(sample_type == "Negative control") |> 
  as_tibble() |> 
  group_by(.feature) |>
  mutate(max_rel_ab = max(rel_ab)) |> 
  ungroup() |> 
  filter(max_rel_ab > 0.1)

tmp |> 
  ggplot() +
  aes(x = sample_id, y = rel_ab, fill = .feature) +
  geom_col() +
  scale_fill_manual(
    str_c("Top ", tmp$.feature |> unique() |> length(), " taxa (based on max rel. ab.)"), 
    breaks = tmp$.feature |> unique(), 
    values = tmp$.feature |> unique() |> get_taxa_colors()
  ) +
  facet_grid(. ~ control_type, scales = "free_x", space = "free_x") +
  theme(
    strip.text.y = element_text(angle = 0),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
    )

Amplification negative controls

Code
tmp <- 
  se_16S |> 
  filter(sample_type == "Amplification negative control") |> 
  as_tibble() |> 
  group_by(.feature) |>
  mutate(max_rel_ab = max(rel_ab)) |> 
  ungroup() |> 
  filter(max_rel_ab > 0.2)

tmp |> 
  ggplot() +
  aes(x = sample_id, y = rel_ab, fill = .feature) +
  geom_col() +
  scale_fill_manual(
    str_c("Top ", tmp$.feature |> unique() |> length(), " taxa (based on max rel. ab.)"), 
    breaks = tmp$.feature |> unique(), 
    values = tmp$.feature |> unique() |> get_taxa_colors()
  ) +
  facet_grid(. ~ control_type, scales = "free_x", space = "free_x") +
  theme(
    strip.text.y = element_text(angle = 0),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
    )

Amplification positive controls

Code
tmp <- 
  se_16S |> 
  filter(sample_type == "Amplification positive control") |> 
  as_tibble() |> 
  group_by(.feature) |>
  mutate(max_rel_ab = max(rel_ab)) |> 
  ungroup() |> 
  filter(max_rel_ab > 0.1)

tmp |> 
  ggplot() +
  aes(x = sample_id, y = rel_ab, fill = .feature) +
  geom_col() +
  scale_fill_manual(
    str_c("Top ", tmp$.feature |> unique() |> length(), " taxa (based on max rel. ab.)"), 
    breaks = tmp$.feature |> unique(), 
    values = tmp$.feature |> unique() |> get_taxa_colors()
  ) +
  facet_grid(. ~ control_type, scales = "free_x", space = "free_x") +
  theme(
    strip.text.y = element_text(angle = 0),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
    )

Positive controls

Code
tmp <- 
  se_16S |> 
  filter(sample_type == "Positive control") |> 
  as_tibble() |> 
  group_by(.feature) |>
  mutate(max_rel_ab = max(rel_ab)) |> 
  ungroup() |> 
  filter(max_rel_ab > 0.01)

tmp |> 
  ggplot() +
  aes(x = sample_id, y = rel_ab, fill = .feature) +
  geom_col() +
  scale_fill_manual(
    str_c("Top ", tmp$.feature |> unique() |> length(), " taxa (based on max rel. ab.)"), 
    breaks = tmp$.feature |> unique(), 
    values = tmp$.feature |> unique() |> get_taxa_colors()
  ) +
  facet_grid(. ~ control_type, scales = "free_x", space = "free_x") +
  theme(
    strip.text.y = element_text(angle = 0),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
    )

That gives us an idea of the replicability and expected variability :)

Replicates

Code
tmp <- 
  se_16S |> 
  as_tibble() |> 
  filter(sample_type == "Clinical sample") |> 
  group_by(pid, visit_code) |> mutate(n = sample_id |> unique() |> length()) |> ungroup() |>
  filter(n > 1) |> 
  filter(.feature %in% .feature[1:20])
  # group_by(.feature) |>
  # mutate(max_rel_ab = max(rel_ab)) |> 
  # ungroup() |> 
  # filter(max_rel_ab > 0.1)

tmp |> 
  ggplot() +
  aes(x = pool |> parse_number() |> factor(), y = rel_ab, fill = .feature, alpha = total_counts |> log10()) +
  geom_col() +
  scale_fill_manual(
    str_c("Top ", tmp$.feature |> unique() |> length(), " taxa (based on max rel. ab.)"), 
    breaks = tmp$.feature |> unique(), 
    values = tmp$.feature |> unique() |> get_taxa_colors()
  ) +
  facet_wrap(. ~ pid + visit_code, scales = "free_x", nrow = 3) +
  theme(
    strip.text.y = element_text(angle = 0),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
    ) +
  xlab("pool")

Relatively good agreement between the replicates; for the longitudinal profiles below, when we have replicates, we’ll use the replicate that had the most total reads.

Caution

@Joseph & co: is that what you intended with the replicates? Or was there a different (better) strategy?

Longitudinal profiles for randomized participants

Code
tmp <- 
  se_16S@colData |> 
  as.data.frame() |> 
  select(-any_of(c("randomized", "location"))) |> 
  dplyr::left_join(
    v |> select(pid, randomized, location) |> distinct(),
    by = join_by(pid)
  )

se_16S@colData$randomized <- tmp$randomized
se_16S@colData$location <- tmp$location
Code
n_taxa <- 20

tmp <- 
  se_16S |> 
  filter(randomized) |> 
  as_tibble() |> 
  filter(.feature %in% .feature[1:n_taxa]) 

selected_replicates <- 
  tmp |> 
  select(sample_id, pid, visit_code, pool, total_counts) |> 
  distinct() |> 
  arrange(pid, visit_code, -total_counts) |> 
  group_by(pid, visit_code) |>
  slice_head(n = 1) |> 
  ungroup() |> 
  group_by(visit_code) |> mutate(n = n()) |> ungroup() |> 
  filter(n > 20)
Code
tmp |> 
  filter(sample_id %in% selected_replicates$sample_id) |>
  ggplot() +
  aes(x = pid, y = rel_ab, fill = .feature) +
  geom_col() +
  scale_fill_manual(
    str_c("Top ", n_taxa, " taxa"), 
    breaks = tmp$.feature |> unique(), 
    values = tmp$.feature |> unique() |> get_taxa_colors()
  ) +
  facet_grid(visit_code ~ location, scales = "free_x", space = "free_x") +
  theme(
    strip.text.y = element_text(angle = 0),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
    )

Code
tmp |> 
  filter(sample_id %in% selected_replicates$sample_id) |>
  ggplot() +
  aes(x = visit_code, y = rel_ab, fill = .feature) +
  geom_col() +
  scale_fill_manual(
    str_c("Top ", n_taxa, " taxa"), 
    breaks = tmp$.feature |> unique(), 
    values = tmp$.feature |> unique() |> get_taxa_colors()
  ) +
  facet_grid(location + pid ~ ., scales = "free_x", space = "free_x") +
  theme(
    strip.text.y = element_text(angle = 0),
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
    )

Save SummarizedExperiment objects

We save two SE objects to disk:

  • se_16S_raw: one that has all the controls and replicates (current se_16S object)

  • se_16S_agg: one that only has the clinical samples (if replicates, we keep the one with the most total counts) and the positive and negative controls (not the amplification controls).

For se_16S_agg, we add the uid column (the VIBRANT cross-assay unique identifier) to the colData and as the SE “sample id” so that we can merge with the other assays.

TODO: change the “replicate” strategy later if needed.

Code
# We only select the clinical samples and the positive and negative controls
se_16S_agg <- 
  se_16S |> 
  filter(
    sample_type %in% c("Clinical sample", "Positive control", "Negative control")
  ) 

# we pick the samples with the most total counts when there are replicates
selected_samples <- 
  se_16S_agg@colData |>
  as.data.frame() |> 
  as_tibble() |>
  filter(sample_type == "Clinical sample") |>
  select(sample_id, pid, visit_code, pool, total_counts) |> 
  distinct() |> 
  arrange(pid, visit_code, -total_counts) |> 
  group_by(pid, visit_code) |>
  slice_head(n = 1) |> 
  ungroup() |> 
  select(sample_id) |> 
  bind_rows(
    se_16S_agg@colData |>
      as.data.frame() |> 
      as_tibble() |>
      filter(sample_type != "Clinical sample") |> 
      select(sample_id)
  ) |> 
  distinct()
  
se_16S_agg <- se_16S_agg[, selected_samples$sample_id]

# We create the VIBRANT uid

se_16S_agg@colData$uid <- 
  ifelse(
    !is.na(se_16S_agg@colData$pid), 
    str_c(
      se_16S_agg@colData$pid, "_",
      se_16S_agg@colData$visit_code
    ), 
    se_16S_agg@colData$sample_id
  )

# we use these uids as the sample IDs
se_16S_agg <- 
  SummarizedExperiment(
    assays = list(
      counts = assay(se_16S_agg, "counts") |> set_colnames(se_16S_agg$uid),
      rel_ab = assay(se_16S_agg, "rel_ab") |> set_colnames(se_16S_agg$uid)
    ),
    rowData = rowData(se_16S_agg),
    colData = se_16S_agg@colData |> set_rownames(se_16S_agg$uid),
    metadata = se_16S_agg@metadata
  )
Code
se_16S_agg
# A SummarizedExperiment-tibble abstraction: 3,360,321 × 51
# Features=817 | Samples=4113 | Assays=counts, rel_ab
   feature   sample counts rel_ab sample_id well_id sample_id_barcode patient_id
   <chr>     <chr>   <dbl>  <dbl> <chr>     <chr>   <chr>             <chr>     
 1 Lactobac… 06810…   6904 0.0933 202_068_… A01     202               068_10_00…
 2 Lactobac… 06810…      0 0      202_068_… A01     202               068_10_00…
 3 Gardnere… 06810…   8284 0.112  202_068_… A01     202               068_10_00…
 4 Ca_Lachn… 06810…      0 0      202_068_… A01     202               068_10_00…
 5 Sneathia… 06810…  21143 0.286  202_068_… A01     202               068_10_00…
 6 Fannyhes… 06810…   1601 0.0216 202_068_… A01     202               068_10_00…
 7 Sneathia… 06810…      0 0      202_068_… A01     202               068_10_00…
 8 Prevotel… 06810…   1963 0.0265 202_068_… A01     202               068_10_00…
 9 Prevotel… 06810…      0 0      202_068_… A01     202               068_10_00…
10 f_Dethio… 06810…   3629 0.0490 202_068_… A01     202               068_10_00…
# ℹ 40 more rows
# ℹ 43 more variables: visit_code_ps <chr>, idt_adapter_name <chr>,
#   i5_index_forward <chr>, i5_index_reverse <chr>, i7_index_reverse <chr>,
#   i7_index <chr>, sample_id_2 <chr>,
#   spike_in_allobacillus_halotolerans_rel <dbl>,
#   spike_in_imtechella_halotolerans_rel <dbl>,
#   spike_in_allobacillus_halotolerans_counts <dbl>, …
Code
saveRDS(
  se_16S, 
  str_c(
    get_output_dir(data_source = data_source),  
    "04_se_16S_raw_", today() |> str_remove_all("-"), ".rds"
    )
  )

saveRDS(
  se_16S_agg, 
  str_c(
    get_output_dir(data_source = data_source),  
    "04_se_16S_agg_", today() |> str_remove_all("-"), ".rds"
    )
  )